# Attach every required package by name (character.only is needed because
# the package name arrives as a string, not a bare symbol).
pkgs <- c("pwr", "dplyr", "ggplot2", "tidyr", "knitr", "kableExtra", "shiny")
for (pkg in pkgs) {
  library(pkg, character.only = TRUE)
}
## Warning: package 'pwr' was built under R version 4.4.1
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
## Warning: package 'shiny' was built under R version 4.4.1
# DnD character generation option: roll 4d6, take the top 3
# simulating to check distribution of that approach
# function to simulate dice rolls ==============================================
# inputs:
# - counts: vector for number of rolls
# - sides: vector of sides per die (aligned with counts)
# - seed: number to set random number generator seed for replication
#         default (NULL) sets no RNG seed
# - drop.lowest.n: vector to specify if dropping the lowest 'n' results
#         default (NULL) will drop none for any roll
#         a single number will apply that number to every roll set
#         a vector of the same length as counts applies element-wise
# output:
# - list with 1 entry per roll set; each entry has the final rolls
#   (excluding any lowest dropped rolls if specified) and sum of the set
#   the final list entry is the sum across all roll sets
roll <- function(counts, sides, seed = NULL, drop.lowest.n = NULL) {
  if (length(counts) != length(sides)) {stop("counts and sides lengths don't match")}
  if (!all(sides %in% c(4, 6, 8, 10, 100, 12, 20))) {
    stop("sides can only contain 4, 6, 8, 10, 100 (%), 12, 20")}
  # only touch the RNG when a seed is supplied: set.seed(NULL) would
  # RE-initialize the generator, which is not "no seed" as documented
  if (!is.null(seed)) {set.seed(seed)}
  pct_idx <- sides == 100 # percentile dice need special 0-9 / 00-90 handling
  # normalize drop.lowest.n to one value per roll set, validating its
  # length BEFORE any elementwise comparison (avoids silent recycling)
  if (is.null(drop.lowest.n)) {
    d.l.n <- rep(0, length(counts))
  } else {
    if (length(drop.lowest.n) == 1L) {
      d.l.n <- rep(drop.lowest.n, length(counts))
    } else if (length(drop.lowest.n) == length(counts)) {
      d.l.n <- drop.lowest.n
    } else {
      stop("drop.lowest.n must be one of: same length as counts, a single number, or NULL")
    }
    # check feasibility of dropping lowest 'n' rolls in each case
    if (any(d.l.n < 0)) {stop("drop.lowest.n cannot be negative")}
    if (any(d.l.n >= counts)) {stop("drop.lowest.n must be at most counts - 1")}
  }
  roll_set <-
    lapply(seq_along(sides), function(i) {
      if (pct_idx[i]) {
        # d%: a units die (0-9) plus a tens die (00-90); 0 + 00 reads as 100
        ones <- sample(0:9, size = counts[i], replace = TRUE)
        tens <- sample(0:9 * 10, size = counts[i], replace = TRUE)
        rslt <- tens + ones
        rslt[rslt == 0] <- 100
      } else {
        rslt <- sample(1:sides[i], size = counts[i], replace = TRUE)
      }
      # drop the lowest n results for this set, if requested
      if (d.l.n[i] > 0) {rslt <- sort(rslt)[-seq_len(d.l.n[i])]}
      list("rolls" = rslt, "sum" = sum(rslt))
    })
  names(roll_set) <- paste0(counts, "d", ifelse(pct_idx, "%", sides))
  sums <-
    vapply(seq_along(roll_set), function(i) {roll_set[[i]]$sum}, numeric(1L))
  roll_set[[length(roll_set) + 1]] <- c("ovr total" = sum(sums))
  roll_set
}
# just curious: impact of 3d4 minus 2 versus 1d10 (same min/max)
# data.frame(setup = c("3d4 - 2 vs. 1d10"),
# sim_seed = 1:1e4) %>%
# mutate(
# results = lapply(1:nrow(.), function(i){
# roll(counts = c(3, 1), sides = c(4, 10), seed = .$sim_seed[i])
# })
# ) %>%
# mutate(
# res_3d4mns2 = vapply(1:nrow(.), function(i) {
# .$results[[i]]$`3d4`$sum - 2}, numeric(1L)),
# res_1d10 = vapply(1:nrow(.), function(i) {
# .$results[[i]]$`1d10`$sum}, numeric(1L)),
# ) %>%
# pivot_longer(cols = starts_with("res_"),
# names_to = "roll", names_prefix = "res_",
# values_to = "result") %>%
# ggplot(aes(x = result, color = roll)) +
# geom_density() +
# scale_x_continuous(breaks = 1:10, minor_breaks = FALSE)
# unsurprising - 3d4 - 2 concentrates results closer to center of results range
# (esp. 4-7); 1d10 is more 'swingy'
Chi-square Goodness of Fit (GoF) Test
Bayesian multinomial modeling
Initial rolls (white, grey, black, most green rolls): FanRoll velvet dice tray with leather backing (‘rainbow watercolor’); 10” x 10” dimensions, moderate padding
Later rolls (rest of green, teal[1/2], mushroom, aubergine, ocean, ice rolls): GameGenic Tokens’ Lair magnetic dice tray; 6.5” x 3.75” dimensions, less padding
# experiment-wide constants ----------------------------------------------
# dice set labels (one physical 7-die set per color name)
sets <- c("white", "grey", "black", "green", "teal1", "teal2", "mushroom", "aubergine", "ocean", "ice")
# standard polyhedral dice; "d%" is the percentile (tens) die
dice <- paste0("d", c(4, 6, 8, 10, "%", 12, 20))
alpha <- 0.05 # P(reject null[fair die] | actually fair die)
# NOTE(review): 'pwr' shadows the attached pwr package's name; harmless for
# the pwr.chisq.test()/ES.w1() calls below, but consider renaming
pwr <- 0.80 # P(don't reject null | alt [stated bias exists] is true)
# all observable face values: 0-20 plus the d% tens values 30-90
side_levels <- c(0:20, seq(30, 90, by = 10))
# null hypothesis: equal probability for each side
# alternative hypothesis: one side is 'preferred' (100 * alt_pref)% more
# (by dice structure, that means opposite side dice is 'disliked' by same %)
# (I'm assuming this holds true for d4 too)
alt_pref <- 0.33333
# power-analysis setup: for every (set, die) pair, compute the chi-square
# GoF effect size under the stated alternative and the sample size needed
# at the chosen alpha / power
dice_setup <-
  data.frame(
    set = rep(sets, each = length(dice)),
    die = rep(dice, times = length(sets))
  ) %>%
  # only d20 for teal1/teal2/mushroom 'set's
  filter(!((grepl("teal", set) | grepl("mushroom", set)) &
             die %in% paste0("d", c(4, 6, 8, 10, "%", 12)))) %>%
  mutate(
    # a d% is physically a 10-faced die, so treat it as 10 sides
    sides_char = ifelse(grepl("\\%", die), 10, sub("d", "", die)),
    sides = as.numeric(sides_char)
  ) %>%
  select(-sides_char) %>%
  # note: need separate mutate() calls when the previous mutate-created
  # content is needed to create subsequent multi-step entry
  mutate(
    # null: uniform probability across all sides
    null = lapply(seq_len(nrow(.)), function(i) {
      rep(1 / .$sides[i], times = .$sides[i])
    }),
    # alternative: one side preferred by alt_pref, the opposite side
    # disliked by the same amount, the rest share what remains equally
    alt = lapply(seq_len(nrow(.)), function(i) {
      eq_prob <- 1 / .$sides[i]
      prefr <- (1 + alt_pref) * eq_prob
      dislk <- (1 - alt_pref) * eq_prob
      rest <- (1 - (prefr + dislk)) / (.$sides[i] - 2)
      c(prefr, dislk, rep(rest, .$sides[i] - 2))
    })
  ) %>%
  mutate(
    # Cohen's w for the chi-square GoF test (pwr::ES.w1)
    effect_size = vapply(seq_len(nrow(.)), function(i) {
      ES.w1(P0 = .$null[[i]], P1 = .$alt[[i]])
    }, numeric(1L))
  ) %>%
  mutate(
    # smallest N achieving the requested power, rounded up to whole rolls
    sample_size = vapply(seq_len(nrow(.)), function(i) {
      N_ <-
        pwr.chisq.test(w = .$effect_size[i],
                       N = NULL,
                       df = .$sides[i] - 1,
                       sig.level = alpha,
                       power = pwr)$N
      ceiling(N_) # round up
    }, numeric(1L))
  )
Notes on data collection:
Rolls must land in the dice tray; any roll outside it will be rejected (this includes rolls hitting the edge of the dice tray and landing outside the tray, rolls landing on the dice tray but then bouncing out, and rolls landing outside the dice tray but then bouncing in; rolls hitting the dice tray side but no other object and landing inside the tray will be included)
Rolls must be spun in the air; any roll (subjectively) deemed to have been insufficiently ‘spun’ will be rejected (i.e. it felt like I basically just dropped the dice instead of spinning it in the air)
Dice are rolled one at a time; each result is recorded before picking up the die to conduct the next roll
If the die doesn’t come to rest fully on one face (i.e. one edge of the die is resting against the side of the dice tray), the die must be (subjectively) deemed completely unambiguous as to which side would be facing up if the tray side were not there for the roll to be recorded
If the die is rotated while picking it up and before the roll result is noted, the die should be rolled twice without recording the results before resuming recording of roll results (to potentially reduce the risk of bias in the recorded result, should one roll have an association with the prior one)
The roll technique is described below:
The die starts sitting upright in the open right hand, only
slightly cupped with the die generally resting around the base of the
middle and ring fingers
The roll is an upward and ‘counterclockwise’ (wrist rotates the
pinky to the right) twist, allowing the die to roll from the hand;
generally this means the die rolls off the pinky, or occasionally the
ring finger
After each roll is recorded, the die is simply picked up and re-rolled; no deliberate care is made to randomize or control the up-facing side of the die (generally it seems to differ from the previous roll’s result due to the picking up action)
# quick plotting function: bar chart of roll results with reference lines
# for the expected per-side count and the observed mean roll
# inputs:
# - df: data frame with a 'result' column; the VARIABLE NAME must follow
#   "<set>_d<sides>_df" (e.g. grey_d20_df) since set/die are parsed from it
# - fill_color: bar fill color
plot_result <- function(df, fill_color = "dodgerblue2") {
  set_die <- deparse(substitute(df)) # store df name as text
  # character class is [0-9%]: digits or a literal '%' (the d% die);
  # the original [0-9|%] also matched a literal '|' by accident
  set_ <- sub("^([a-z]+)_d[0-9%]{1,2}_df$", "\\1", set_die)
  die_ <- sub("^[a-z]+_(d[0-9%]{1,2})_df$", "\\1", set_die)
  mean_result <- mean(df$result) # observed mean (computed once)
  avg_count <- (1 / length(unique(df$result))) * nrow(df) # expected count/side
  ggplot(df, aes(x = result)) +
    geom_vline(xintercept = mean_result, linetype = "solid") +
    geom_hline(yintercept = avg_count, linetype = "solid") +
    geom_bar(fill = fill_color, color = "black") +
    # paste0 fixes the original's double space in "n =  <count>"
    labs(title = paste("Plot of", set_, die_, "results"),
         subtitle = paste0("n = ", nrow(df)),
         caption = paste0("Horizontal line indicates expected count (",
                          sprintf("%.1f", avg_count),
                          "); Vertical line indicates mean value (",
                          sprintf("%.1f", mean_result), ")")) +
    scale_x_continuous(breaks = unique(df$result)) +
    theme_bw()
}
# per-set fill colors used by every plot below (names align with 'sets')
set_fill_map <-
c("white" = "ivory3", "grey" = "grey40",
"black" = "grey10", "green" = "green4",
"teal1" = "darkcyan", "teal2" = "steelblue4",
"mushroom" = "gold3", "aubergine" = "#614051",
"ocean" = "cyan2", "ice" = "powderblue")# loading the above dice-roll data
# columns used later: set, die, result -- TODO confirm full CSV schema
dice_roll_df <-
read.csv(file = "dice_rolls.csv")
# Identify runs of identical consecutive values in x.
# - x: atomic vector of roll results
# - streaks.only: if TRUE (default), keep only runs longer than 1
# returns a data.frame with columns 'value' (the repeated value) and
# 'length' (how many consecutive times it appeared)
find_run_lengths <- function(x, streaks.only = TRUE) {
  encoded <- rle(x)
  runs <- data.frame(value = encoded$values,
                     length = encoded$lengths)
  if (streaks.only) {
    runs <- runs[runs$length > 1L, ]
  }
  runs
}
# function for multinomial proportion modelling
# draw n samples from a Dirichlet(par) distribution (adapted from the
# LearnBayes package; vectorized, replacing `=` assignment and per-column
# loops)
# - n: number of samples (rows)
# - par: vector of Dirichlet concentration parameters
# returns an n x length(par) matrix whose rows each sum to 1
rdirichlet <-
  function(n, par) {
    k <- length(par)
    # gamma draws are generated column-by-column (all n draws at shape
    # par[1] first), reproducing the original loop's RNG stream exactly
    z <- matrix(rgamma(n * k, shape = rep(par, each = n)),
                nrow = n, ncol = k)
    # normalize each row by its total: independent Gammas -> Dirichlet
    z / rowSums(z)
  }
dice_roll_df_ <-
# per-(set, die) summary: observed vs. expected stats, chi-square GoF test,
# and streak bookkeeping; nest_by() leaves the frame rowwise, so every
# mutate expression below is evaluated once per (set, die) row with 'data'
# holding that row's nested rolls
dice_roll_df %>%
mutate(set = factor(set, levels = sets),
die = factor(die, levels = paste0("d", c(4, 6, 8, 10, "%", 12, 20)))
) %>%
nest_by(set, die) %>%
mutate(
# numeric side count; the d% row warns (as.numeric("%") is NA) but
# ifelse() still returns 100 for it
die_val = sub("d", "", die),
die_val = ifelse(die_val == "%", 100, as.numeric(die_val)),
n_obs = length(data$result),
# a d% has 10 physical faces, hence n_obs / 10 for die_val == 100
expected_count_per_side =
case_when(die_val == 100 ~ n_obs / 10,
TRUE ~ n_obs / die_val),
mean_ = mean(data$result),
# d% results are recorded as the tens values 0, 10, ..., 90
expected_mean =
case_when(die_val == 100 ~ mean(0:9 * 10),
die_val == 10 ~ mean(1:10),
TRUE ~ mean(1:die_val)),
sd_ = sd(data$result),
expected_sd =
case_when(die_val == 100 ~ sd(0:9 * 10),
die_val == 10 ~ sd(1:10),
TRUE ~ sd(1:die_val)),
# observed counts per face; chi-square GoF against uniform expectations
result_tbl = list(table(data$result)),
chisq_pval = chisq.test(result_tbl)$p.value,
chisq_stat_sig_0.05 = chisq_pval <= 0.05,
# runs of identical consecutive results (length > 1 only)
roll_streaks = list(find_run_lengths(data$result, streaks.only = TRUE)),
roll_streak_tbl = list(table(roll_streaks))
)# checking which dice per side across sets have largest p-values
# top 4 sets per die by chi-square GoF p-value (higher p = "fairer");
# arrange() ignores groups by default, so ranks follow the global sort
fairest_dice_top4 <-
  dice_roll_df_ %>%
  group_by(die) %>%
  arrange(desc(chisq_pval)) %>%
  mutate(p_val_rank = row_number()) %>%
  ungroup() %>%
  filter(p_val_rank <= 4) %>%
  select(set, die, chisq_pval, p_val_rank) %>%
  arrange(die, p_val_rank, set)
# bottom 3 sets per die by chi-square GoF p-value (lowest p = least fair);
# a single rank filter replaces the original's three throwaway flag
# columns (bad/worse/worst) and its inconsistent mix of %>% and |>
fairest_dice_bot3 <-
  dice_roll_df_ %>%
  group_by(die) %>%
  arrange(desc(chisq_pval)) %>%
  mutate(p_val_rank = 1:n(), max_rank = n()) %>%
  ungroup() %>%
  # keep the last 3 ranks: max_rank - 2, max_rank - 1, max_rank
  filter(p_val_rank >= max_rank - 2) %>%
  select(set, die, chisq_pval, p_val_rank) %>%
  arrange(die, desc(p_val_rank), set)
# table formatting: render a data frame as a striped, bordered HTML table
# (full_width = FALSE keeps the table at its natural width)
format_table <- function(x) {
  html_tbl <- kbl(x, format = "html")
  kable_styling(html_tbl,
                full_width = FALSE,
                bootstrap_options = c("striped", "bordered"))
}
noquote("Fairest Dice - Top 4")## [1] Fairest Dice - Top 4
| set | die | chisq_pval | p_val_rank |
|---|---|---|---|
| grey | d4 | 0.900 | 1 |
| aubergine | d4 | 0.881 | 2 |
| ocean | d4 | 0.641 | 3 |
| black | d4 | 0.632 | 4 |
| aubergine | d6 | 0.802 | 1 |
| green | d6 | 0.612 | 2 |
| grey | d6 | 0.561 | 3 |
| black | d6 | 0.253 | 4 |
| aubergine | d8 | 0.982 | 1 |
| grey | d8 | 0.910 | 2 |
| white | d8 | 0.411 | 3 |
| black | d8 | 0.386 | 4 |
| grey | d10 | 0.829 | 1 |
| white | d10 | 0.804 | 2 |
| black | d10 | 0.793 | 3 |
| green | d10 | 0.556 | 4 |
| grey | d% | 0.624 | 1 |
| white | d% | 0.363 | 2 |
| black | d% | 0.305 | 3 |
| aubergine | d% | 0.130 | 4 |
| white | d12 | 0.507 | 1 |
| green | d12 | 0.393 | 2 |
| ice | d12 | 0.387 | 3 |
| black | d12 | 0.321 | 4 |
| aubergine | d20 | 0.097 | 1 |
| ice | d20 | 0.002 | 2 |
| grey | d20 | 0.001 | 3 |
| teal2 | d20 | 0.000 | 4 |
## [1] Fairest Dice - Sets by # Appearances and Average Ranking in Top 4
# per-set appearance count and average rank across the top-4 lists
fairest_dice_top4 |>
  group_by(set) |>
  summarize(top4_ct = n(),
            avg_rank = round(mean(p_val_rank), 3),
            .groups = "drop") |>
  arrange(desc(top4_ct), avg_rank) |>
  format_table()
| set | top4_ct | avg_rank |
|---|---|---|
| grey | 6 | 1.833 |
| black | 6 | 3.667 |
| aubergine | 5 | 1.800 |
| white | 4 | 2.000 |
| green | 3 | 2.667 |
| ice | 2 | 2.500 |
| ocean | 1 | 3.000 |
| teal2 | 1 | 4.000 |
# grey, then aubergine, then black, then white, then green
# (descending order of 'fairness')
# aside from d20s, no real concerns of fairness for any observations
# aubergine d20 is astonishingly 'fair'!
# surprised the ice d20 is second fairest...
noquote("Least Fair Dice - Bottom 3")## [1] Least Fair Dice - Bottom 3
| set | die | chisq_pval | p_val_rank |
|---|---|---|---|
| ice | d4 | 0.588 | 7 |
| white | d4 | 0.597 | 6 |
| green | d4 | 0.614 | 5 |
| ocean | d6 | 0.024 | 7 |
| white | d6 | 0.068 | 6 |
| ice | d6 | 0.187 | 5 |
| green | d8 | 0.040 | 7 |
| ice | d8 | 0.170 | 6 |
| ocean | d8 | 0.351 | 5 |
| ocean | d10 | 0.051 | 7 |
| ice | d10 | 0.070 | 6 |
| aubergine | d10 | 0.133 | 5 |
| ice | d% | 0.004 | 7 |
| green | d% | 0.055 | 6 |
| ocean | d% | 0.105 | 5 |
| ocean | d12 | 0.014 | 7 |
| aubergine | d12 | 0.079 | 6 |
| grey | d12 | 0.287 | 5 |
| green | d20 | 0.000 | 10 |
| white | d20 | 0.000 | 9 |
| black | d20 | 0.000 | 8 |
## [1] Least Fair Dice, Arranged by p-value
fairest_dice_bot3 |>
arrange(chisq_pval) |>
mutate(chisq_pval = round(chisq_pval, 3)) |>
format_table()| set | die | chisq_pval | p_val_rank |
|---|---|---|---|
| green | d20 | 0.000 | 10 |
| white | d20 | 0.000 | 9 |
| black | d20 | 0.000 | 8 |
| ice | d% | 0.004 | 7 |
| ocean | d12 | 0.014 | 7 |
| ocean | d6 | 0.024 | 7 |
| green | d8 | 0.040 | 7 |
| ocean | d10 | 0.051 | 7 |
| green | d% | 0.055 | 6 |
| white | d6 | 0.068 | 6 |
| ice | d10 | 0.070 | 6 |
| aubergine | d12 | 0.079 | 6 |
| ocean | d% | 0.105 | 5 |
| aubergine | d10 | 0.133 | 5 |
| ice | d8 | 0.170 | 6 |
| ice | d6 | 0.187 | 5 |
| grey | d12 | 0.287 | 5 |
| ocean | d8 | 0.351 | 5 |
| ice | d4 | 0.588 | 7 |
| white | d4 | 0.597 | 6 |
| green | d4 | 0.614 | 5 |
# green d20 is clearly not random, nor is white d20
# for other-sided dice even the 'worst' performers are solid except
# 'ice' d10 slightly a bit biased against '0', EV is ok (5.4 vs. 'fair' 5.5)
# 'ice' d% is very unlikely to roll 00, otherwise looks great
# 'ocean' d12 ... expected value (6.3) isn't wildly off 'random' EV (6.5)
# 'ocean' d6 ... expected value (3.5) exactly matches 'random' EV,
# just more 3s than 1s # observed vs. expected mean / SD
# long-format observed-vs-expected mean per (set, die), with percent
# difference, shaped for the pointrange plots below
dice_mean_df_tbl <-
dice_roll_df_ %>%
select(set, die, observed_mean = mean_, expected_mean) %>%
mutate("pct_diff" = (observed_mean - expected_mean) / expected_mean * 100) %>%
pivot_longer(cols = ends_with("_mean"), names_to = "mean", values_to = "value") %>%
mutate(type = sub("_mean", "", mean), var = "mean") %>%
select(-mean)
# same reshaping, for the standard deviation
dice_sd_df_tbl <-
dice_roll_df_ %>%
select(set, die, observed_sd = sd_, expected_sd) %>%
mutate("pct_diff" = (observed_sd - expected_sd) / expected_sd * 100) %>%
pivot_longer(cols = ends_with("_sd"), names_to = "sd", values_to = "value") %>%
mutate(type = sub("_sd", "", sd), var = "sd") %>%
select(-sd)
# stack the mean and sd rows: joining on ALL shared columns acts as a row
# union, then flag entries more than 10% off the expected value
dice_mean_sd_tbl <-
full_join(dice_mean_df_tbl, dice_sd_df_tbl,
by = c("set", "die", "var", "type", "value", "pct_diff")) %>%
mutate(gt_10pct_diff = abs(pct_diff) > 10)
# one expected-vs-observed mean/SD pointrange plot per dice set,
# faceted by die (rows) and statistic (columns)
mean_sd_by_set_plots <-
  lapply(seq_along(sets), function(set_idx) {
    set_name <- sets[set_idx]
    set_data <- dice_mean_sd_tbl[dice_mean_sd_tbl$set == set_name, ]
    ggplot(data = set_data,
           aes(x = type, ymin = 0, ymax = value, y = value,
               color = gt_10pct_diff)) +
      facet_grid(rows = vars(die), cols = vars(var), scales = "free_y") +
      geom_pointrange() +
      labs(title = "Expected vs. Observed Mean/SD of Dice Rolls",
           subtitle = paste(set_name, "dice"),
           color = ">10% diff.\nvs. expected") +
      scale_color_manual(values = c("FALSE" = "grey30", "TRUE" = "orange3")) +
      theme_bw()
  })
# focus on cases of interest
focus_mean_sd_plots <-
dice_mean_sd_tbl %>%
filter(gt_10pct_diff) %>%
nest_by(set, die) %>%
mutate(
plot_ = list(
ggplot(data = data,
aes(x = type, ymin = 0, ymax = value, y = value)) +
facet_wrap(facets = vars(var), scales = "free_y", ncol = 2) +
geom_pointrange(color = "orange3") +
labs(title = "Expected vs. Observed Mean/SD of Dice Rolls",
subtitle = paste(set, die)) +
theme_bw()
)
) %>%
pull(plot_)for (i in seq_along(mean_sd_by_set_plots)) {
if (i == 1) {cat("All Mean/SD Plots\n\n")}
cat("\n\n")
print(mean_sd_by_set_plots[[i]])
}All Mean/SD Plots
for (i in seq_along(focus_mean_sd_plots)) {
if (i == 1) {cat("10+% difference Mean/SD Plots\n\n")}
cat("\n\n")
print(focus_mean_sd_plots[[i]])
}10+% difference Mean/SD Plots
streak_color_map <-
c("1" = "blue4", "2" = "blue2", "3" = "dodgerblue3",
"4" = "firebrick1", "5" = "firebrick", "6" = "red4")
dice_streaks_plots <-
lapply(1:nrow(dice_roll_df_), function(i) {
df_0 <- dice_roll_df_$roll_streak_tbl[[i]]
df_ <- data.frame(value = rep(rownames(df_0), times = ncol(df_0)),
strk_len = rep(colnames(df_0), each = nrow(df_0)))
df_$value <- as.factor(as.integer(df_$value))
df_$count <- as.integer(df_0)
ggplot(df_, aes(x = value, y = count, fill = strk_len)) +
geom_col(position = position_dodge(width = 0.7)) +
labs(title = "Streak length by value",
subtitle = paste(dice_roll_df_$set[i], dice_roll_df_$die[i])) +
scale_fill_manual(values = streak_color_map) +
theme_bw()
})
streak_sim <-
lapply(1:100, function(i) {
roll_sim <- roll(counts = 1851, sides = 20, seed = i)[[1]]$rolls
rle_ <- find_run_lengths(roll_sim, streaks.only = TRUE)
rle_
}) %>%
bind_rows() %>%
table()
streak_sim_df <-
data.frame(value = rep(rownames(streak_sim), times = ncol(streak_sim)),
strk_len = rep(colnames(streak_sim), each = nrow(streak_sim)))
streak_sim_df$value <- as.factor(as.integer(streak_sim_df$value))
streak_sim_df$count <- as.integer(streak_sim)
streak_sim_plot <-
ggplot(streak_sim_df, aes(x = value, y = count, fill = strk_len)) +
geom_col(position = position_dodge(width = 0.7)) +
labs(title = "Streak length by value",
subtitle = "Simulated d20 rolls",
caption = "1,851 rolls simulated 100 times") +
scale_fill_manual(values = streak_color_map) +
theme_bw()
dice_streak_focus <- c(7, 14, 21, 28, 29, 30, 31, 38, 40, 45, 52)
focused_streak_plots <-
lapply(seq_along(dice_streak_focus), function(i){
dice_streaks_plots[[dice_streak_focus[i]]]
})Simulated d20 rolls for comparison
Selected Streak Plots
Note: The ‘aubergine’ d20 is (amazingly) not statistically significant; it’s included for comparison against the other d20s, all of which were stat. sig. in the chi-squared test.
# plots of (Chi square HoV) stat-sig dice - side-by-side for same-die sets
comparison_plots_df <-
dice_roll_df_ %>%
filter(chisq_stat_sig_0.05 |
(set == "aubergine" & die == "d20")) %>%
nest_by(die)
set_color_map <-
c("white" = "black", "grey" = NA_character_,
"black" = NA_character_, "green" = NA_character_,
"teal1" = NA_character_, "teal2" = NA_character_,
"mushroom" = NA_character_, "aubergine" = NA_character_,
"ocean" = NA_character_, "ice" = NA_character_)
comparison_plots <-
lapply(1:nrow(comparison_plots_df), function(i) {
df_ <- comparison_plots_df[i,]
summary_data <- df_$data
results_data <-
lapply(summary_data, `[[`, "data") %>%
bind_rows() %>%
mutate(set = rep(summary_data[[1]]$set, times = summary_data[[1]]$n_obs))
x_vals <- unique(results_data$result)
x_vals <- x_vals[order(x_vals)]
ggplot(results_data, aes(x = result, fill = set, color = set)) +
geom_hline(data = summary_data[[1]],
aes(yintercept = expected_count_per_side),
linetype = "dashed") +
geom_bar(position = position_dodge(width = 0.7)) +
labs(title = paste("Observed Counts for", df_$die, "Rolls by Set"),
subtitle = "Horizontal line: expected roll count (fair die)") +
scale_x_continuous(breaks = x_vals, minor_breaks = FALSE) +
scale_color_manual(values = set_color_map, guide = "none") +
scale_fill_manual(values = set_fill_map) +
theme_bw()
})# simulating p-values from subsampling observed rolls - checking if
# stat. sig. p-vals are more a result of large sample size
# simulate chi-square GoF p-values from repeated subsamples of observed rolls
# inputs:
# - data: data frame holding a 'set' column plus subset_var and value_var
# - subset_var: column name whose unique values split data into groups
# - value_var: column name of the roll results to subsample
# - nsim: number of subsamples per group
# - n: subsample size (drawn without replacement)
# - seed: base RNG seed; row i uses seed + i (recorded in column 1)
# returns an nsim x (groups + 1) matrix: a seed column, then one p-value
# column per group (named by each group's 'set' value)
sample_chisq_pval <-
  function(data, subset_var, value_var, nsim = 1e4, n = 500, seed = 42) {
    # split the data once up front (the original recomputed unique() on
    # every iteration and allocated a p_val_vec it never used)
    group_vals <- unique(data[[subset_var]])
    data_sets <- lapply(seq_along(group_vals), function(i) {
      in_group <- data[[subset_var]] == group_vals[i]
      list("set" = data[in_group, "set"][1],
           "data" = data[in_group, value_var])
    })
    p_val_mat <-
      matrix(nrow = nsim, ncol = length(data_sets) + 1,
             dimnames = list(NULL,
                             c("seed",
                               vapply(data_sets, `[[`, character(1L), 1)))
      )
    for (i in seq_len(nrow(p_val_mat))) {
      for (j in 2:ncol(p_val_mat)) {
        # re-seed per (i, j) so every group's i-th subsample starts from
        # the same RNG state (preserves the original's paired sampling)
        set.seed(seed + i)
        p_val_mat[i, 1] <- seed + i
        p_val_mat[i, j] <-
          chisq.test(
            table(
              sample(data_sets[[j - 1]]$data, size = n, replace = FALSE)
            )
          )$p.value
      }
    }
    p_val_mat
  }
d20_sample_sim_n1000 <-
sample_chisq_pval(
data = dice_roll_df[dice_roll_df$die == "d20", c("set", "result")],
subset_var = "set",
value_var = "result",
nsim = 5e3, n = 1000, seed = 42
)
d20_sample_sim_n750 <-
sample_chisq_pval(
data = dice_roll_df[dice_roll_df$die == "d20", c("set", "result")],
subset_var = "set",
value_var = "result",
nsim = 5e3, n = 750, seed = 42
)
d20_sample_sim_n500 <-
sample_chisq_pval(
data = dice_roll_df[dice_roll_df$die == "d20", c("set", "result")],
subset_var = "set",
value_var = "result",
nsim = 5e3, n = 500, seed = 42
)
d20_sample_sim_n1000_df <-
apply(d20_sample_sim_n1000, 2, quantile, probs = c(0.10, 0.50, 0.90)) %>%
as.data.frame() %>%
select(-seed) %>%
mutate(quantile = rownames(.)) %>%
pivot_longer(cols = all_of(sets), names_to = "set", values_to = "value") %>%
mutate(n = 1000) %>%
pivot_wider(names_from = "quantile", values_from = "value")
d20_sample_sim_n750_df <-
apply(d20_sample_sim_n750, 2, quantile, probs = c(0.10, 0.50, 0.90)) %>%
as.data.frame() %>%
select(-seed) %>%
mutate(quantile = rownames(.)) %>%
pivot_longer(cols = all_of(sets), names_to = "set", values_to = "value") %>%
mutate(n = 750) %>%
pivot_wider(names_from = "quantile", values_from = "value")
d20_sample_sim_n500_df <-
apply(d20_sample_sim_n500, 2, quantile, probs = c(0.10, 0.50, 0.90)) %>%
as.data.frame() %>%
select(-seed) %>%
mutate(quantile = rownames(.)) %>%
pivot_longer(cols = all_of(sets), names_to = "set", values_to = "value") %>%
mutate(n = 500) %>%
pivot_wider(names_from = "quantile", values_from = "value")
rm(d20_sample_sim_n1000, d20_sample_sim_n750, d20_sample_sim_n500)
# Full-outer-join a list of data frames pairwise on their shared columns.
merge_df <- function(df_list) {
  join_two <- function(left, right) merge(left, right, all = TRUE)
  Reduce(join_two, df_list)
}
d20_sample_sim_df <-
merge_df(list(d20_sample_sim_n500_df,
d20_sample_sim_n750_df,
d20_sample_sim_n1000_df))
write.csv(x = d20_sample_sim_df,
file = "d20_sample_sim.csv",
row.names = FALSE)d20_sample_sim_df <-
read.csv("d20_sample_sim.csv")
set_d20_ranks <-
d20_sample_sim_df |>
group_by(set) |>
summarize(X50. = mean(X50.), .groups = "drop") |>
arrange(desc(X50.)) |>
pull(set)
set_d20_ranks_abbrevmap <-
c("aubergine" = "aub",
"ice" = "ice",
"grey" = "gry",
"teal2" = "tl2",
"ocean" = "ocn",
"mushroom" = "msh",
"teal1" = "tl1",
"black" = "blk",
"white" = "wht",
"green" = "grn")
# plot subsampled chi-square p-value summaries by set, faceted by n
d20_sample_sim_df |>
  mutate(set = factor(set, levels = set_d20_ranks)) |>
  ggplot(aes(x = set, ymin = X10., y = X50., ymax = X90., color = set)) +
  geom_hline(yintercept = 0.05, linetype = "dashed") +
  geom_pointrange() +
  facet_wrap(facets = vars(n), ncol = 2) +
  # subtitle corrected: the simulations above run nsim = 5e3 at each of
  # n = 500 / 750 / 1000 (old label said "1,000 samples of size 500 / 1000")
  labs(title = "p-values of Chi-square test of d20 by set",
       subtitle = "Summary of 5,000 samples of size 500 / 750 / 1000",
       y = "Chi Square p-value",
       caption = "Sampling without replacement; 10/50/90% p-value summary; ref. line at 0.05") +
  scale_color_manual(values = set_fill_map, guide = "none") +
  theme_bw() +
  theme(panel.grid.major.x = element_blank(),
        axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1))# brms note for Dirichlet model:
#https://discourse.mc-stan.org/t/dirichlet-regresion-using-brms/8591
set.seed(42)
d20_bayesian_pcts_df <-
data.frame(
set = sets,
die = "d20"
) %>%
left_join(
y = dice_roll_df_ %>%
filter(die == "d20") %>%
select(set, die, results = result_tbl),
by = c("set", "die")
) %>%
mutate(
thetas_df = lapply(1:nrow(.), function(i) {
set_ <- .$set[i]
die_ <- .$die[i]
sim_df <-
rdirichlet(n = 1e3, par = unlist(unname(.$results[i])) + 1) %>%
as.data.frame()
colnames(sim_df) <- paste0("_", 1:20)
sim_df %>%
pivot_longer(cols = everything(), names_to = "side", values_to = "value",
names_prefix = "_") %>%
mutate(set = set_, die = die_)
})
)
d20_bayesian_pcts_df <-
lapply(1:nrow(d20_bayesian_pcts_df), function(i) {
d20_bayesian_pcts_df$thetas_df[[i]]
}) %>%
bind_rows() %>%
mutate(side = factor(side, levels = as.character(1:20)))
d20_bayesian_pcts_df$iter <-
rep(
rep(1:(nrow(d20_bayesian_pcts_df) /
(20 * length(unique(d20_bayesian_pcts_df$set)))),
each = 20),
times = length(unique(d20_bayesian_pcts_df$set))
)
set_lntyp_map <-
c("white" = "solid", "grey" = "solid",
"black" = "solid", "green" = "solid",
"teal1" = "solid", "teal2" = "solid",
"mushroom" = "solid", "aubergine" = "dashed",
"ocean" = "solid", "ice" = "solid")
d20_bayesian_pcts_plot_top5 <-
d20_bayesian_pcts_df |>
filter(set %in% set_d20_ranks[1:5]) |>
ggplot(aes(x = value, color = set, linetype = set)) +
facet_wrap(facets = vars(side), ncol = 4) +
geom_vline(xintercept = 1 / 20, linetype = "dashed") +
geom_density(linewidth = 0.8) +
labs(title = "Bayesian posterior density of d20 roll probabilities by die set (top 5)",
subtitle = "uniform Dirichlet prior, 1851 rolls per die",
caption = "Reference line at 5%") +
scale_x_continuous(labels = scales::percent) +
scale_color_manual(values = set_fill_map) +
scale_linetype_manual(values = set_lntyp_map) +
theme_bw()
d20_bayesian_pcts_plot_nottop5 <-
d20_bayesian_pcts_df |>
filter(!(set %in% set_d20_ranks[1:5])) |>
ggplot(aes(x = value, color = set, linetype = set)) +
facet_wrap(facets = vars(side), ncol = 4) +
geom_vline(xintercept = 1 / 20, linetype = "dashed") +
geom_density(linewidth = 0.8) +
labs(title = "Bayesian posterior density of d20 roll probabilities by die set (not top 5)",
subtitle = "uniform Dirichlet prior, 1851 rolls per die",
caption = "Reference line at 5%") +
scale_x_continuous(labels = scales::percent) +
scale_color_manual(values = set_fill_map) +
scale_linetype_manual(values = set_lntyp_map) +
theme_bw()Top 5 d20 sets
Remaining d20 sets
# 95% credible intervals (2.5 / 50 / 97.5 percentiles) of the posterior
# side probabilities, per set and side
d20_bayesian_credint_df <-
  d20_bayesian_pcts_df |>
  group_by(set, side) |>
  summarize(
    credint_pct2.5 = quantile(value, 0.025),
    credint_pct50 = median(value),
    credint_pct97.5 = quantile(value, 0.975),
    .groups = "drop"
  )
d20_bayesian_credint_df <-
  d20_bayesian_credint_df |>
  # exact vectorized named lookup; the original looped grep() treated each
  # set name as a regex pattern, which is slower and would mis-match if one
  # set name were contained in another
  mutate(set_abbrev = unname(set_d20_ranks_abbrevmap[set])) |>
  mutate(set_abbrev = factor(set_abbrev, levels = unname(set_d20_ranks_abbrevmap)),
         set = factor(set, levels = names(set_d20_ranks_abbrevmap)))
# plotting the credible intervals
d20_bayesian_credint_df |>
ggplot(aes(x = set_abbrev,
ymin = credint_pct2.5, y = credint_pct50, ymax = credint_pct97.5,
color = set)) +
facet_wrap(facets = vars(side), ncol = 4) +
geom_hline(yintercept = 1 / 20, linetype = "dashed") +
geom_pointrange(linewidth = 1) +
labs(title = "d20 95% Credible Intervals with Median by Set and Side",
y = "Posterior Probability (%)",
caption = "Reference line at 5%") +
scale_y_continuous(labels = scales::percent) +
scale_color_manual(values = set_fill_map) +
theme_bw() +
theme(axis.text.x = element_text(angle = 40, hjust = 1, vjust = 1))# deep dive: posterior probability 1 vs. 20
d20_bayesian_credint_df |>
filter(side %in% c("1", "20")) |>
ggplot(aes(x = set,
ymin = credint_pct2.5, y = credint_pct50, ymax = credint_pct97.5,
color = set, group = side)) +
geom_hline(yintercept = 1 / 20, linetype = "dashed") +
geom_pointrange(fatten = 10, shape = 22, linewidth = 1, fill = "white",
position = position_dodge(width = 0.5)) +
geom_text(aes(label = side), size = 3, color = "black",
position = position_dodge(width = 0.5), vjust = 0.5) +
labs(title = "d20 95% Credible Intervals with Median by Set and Side",
subtitle = "Zoom in for P(1) and P(20)",
y = "Posterior Probability (%)",
caption = "Reference line at 5%") +
scale_y_continuous(labels = scales::percent) +
scale_color_manual(values = set_fill_map, guide = "none") +
theme_bw()# probability 1-3 vs 18-20
btm_top <- 3
d20_bayesian_btm_df <-
d20_bayesian_pcts_df |>
filter(side %in% c(1:btm_top)) |>
group_by(set, iter) |>
summarize(value = sum(value), .groups = "drop") |>
mutate(grp = paste("1 to", btm_top)) |>
group_by(grp, set) |>
summarize(
credint_pct2.5 = quantile(value, 0.025),
credint_pct50 = median(value),
credint_pct97.5 = quantile(value, 0.975),
.groups = "drop"
)
d20_bayesian_top_df <-
d20_bayesian_pcts_df |>
filter(side %in% c((20 - btm_top + 1):20)) |>
group_by(set, die, iter) |>
summarize(value = sum(value), .groups = "drop") |>
mutate(grp = paste((20 - btm_top + 1), "to 20")) |>
group_by(grp, set) |>
summarize(
credint_pct2.5 = quantile(value, 0.025),
credint_pct50 = median(value),
credint_pct97.5 = quantile(value, 0.975),
.groups = "drop"
)
d20_bayesian_lohi_df <-
full_join(d20_bayesian_btm_df, d20_bayesian_top_df,
by = intersect(colnames(d20_bayesian_btm_df),
colnames(d20_bayesian_top_df)))
d20_bayesian_lohi_df |>
mutate(set = factor(set, levels = names(set_d20_ranks_abbrevmap))) |>
ggplot(aes(x = set,
ymin = credint_pct2.5, y = credint_pct50, ymax = credint_pct97.5,
color = set, shape = grp)) +
geom_hline(yintercept = btm_top / 20, linetype = "dashed") +
geom_pointrange(fatten = 5, linewidth = 1, fill = "white",
position = position_dodge(width = 0.5)) +
labs(title = "d20 95% Credible Intervals with Median by Set and Side",
subtitle = paste("Zoom in for lowest and highest", btm_top, "sides"),
y = "Posterior Probability (%)",
caption = paste("Reference line at", paste0(btm_top / 20 * 100, "%")),
shape = NULL) +
scale_y_continuous(labels = scales::percent) +
scale_color_manual(values = set_fill_map, guide = "none") +
scale_shape_manual(values = c(25, 24)) +
theme_bw()Dynamic Shiny app
Note: this was an experiment in embedding a Shiny app in an R Markdown file; the HTML file needs to be rendered locally from the .Rmd file for this to work. IF DOING THIS, ADD ‘runtime: shiny’ AT THE LAST LINE OF THE .RMD YAML HEADER
# Shiny app radar plot for median posterior probability of sides per d20
shinyApp(
ui = fluidPage(
selectInput("set", "Dice set(s) to emphasize:",
choices = unique(d20_bayesian_pcts_df$set)[order(unique(d20_bayesian_pcts_df$set))],
selected = unique(d20_bayesian_pcts_df$set)[order(unique(d20_bayesian_pcts_df$set))][1],
multiple = TRUE),
checkboxInput("arrange_desc", "Arrange sides by median posterior probability"),
plotOutput("radar_plot", width = "100%")
),
server = function(input, output) {
median_prob_df <-
d20_bayesian_pcts_df |>
group_by(die, set, side) |>
summarize(median_post_prob = median(value), .groups = "drop")
sorted_side_order <-
reactive({
if (input$arrange_desc) {
median_prob_df |>
filter(set %in% input$set) |>
group_by(side) |>
summarize(mean_medn_prob = mean(median_post_prob),
.groups = "drop") |>
mutate(side = as.integer(side)) |>
arrange(desc(mean_medn_prob)) |>
pull(side)
} else { 1:20 }
})
medn_prob_df <-
reactive({
mutate(median_prob_df, side = factor(side, levels = sorted_side_order()))
})
emph_prob_df <-
reactive({
median_prob_df |>
filter(set %in% input$set) |>
mutate(side = factor(side, levels = sorted_side_order()))
})
output$radar_plot = renderPlot({
ggplot(data = emph_prob_df(), aes(x = side, y = median_post_prob, color = set, group = set)) +
geom_hline(yintercept = 1 / 20, linetype = "dashed") +
geom_line(data = medn_prob_df(), color = "grey") +
geom_point(data = medn_prob_df(), color = "grey") +
geom_line(data = emph_prob_df(), linewidth = 1) +
geom_point(data = emph_prob_df(), size = 3) +
labs(title = "Radar-like Plot of Median Posterior Probability of (side)",
subtitle = "All d20 in grey, selected set(s) in color",
x = NULL, y = NULL, color = NULL,
caption = "Reference line at 5%") +
scale_y_continuous(labels = scales::percent) +
scale_color_manual(values = set_fill_map) +
theme_bw() +
theme(axis.text.x = element_text(size = 10),
axis.text.y = element_text(size = 10),
plot.caption = element_text(size = 10),
legend.text = element_text(size = 10)) +
coord_polar()
})
},
options = list(height = 550)
)Static Plots
mdn_post_prob_plts <-
lapply(seq_along(set_d20_ranks), function(i) {
median_prob_df <-
d20_bayesian_pcts_df |>
group_by(die, set, side) |>
summarize(median_post_prob = median(value), .groups = "drop")
sorted_side_order <-
median_prob_df |>
filter(set == set_d20_ranks[i]) |>
group_by(side) |>
summarize(mean_medn_prob = mean(median_post_prob),
.groups = "drop") |>
mutate(side = as.integer(side)) |>
arrange(desc(mean_medn_prob)) |>
pull(side)
medn_prob_df <-
mutate(median_prob_df, side = factor(side, levels = sorted_side_order))
emph_prob_df <-
median_prob_df |>
filter(set == set_d20_ranks[i]) |>
mutate(side = factor(side, levels = sorted_side_order))
ggplot(data = emph_prob_df, aes(x = side, y = median_post_prob, color = set, group = set)) +
geom_hline(yintercept = 1 / 20, linetype = "dashed") +
geom_line(data = medn_prob_df, color = "grey") +
geom_point(data = medn_prob_df, color = "grey") +
geom_line(data = emph_prob_df, linewidth = 1) +
geom_point(data = emph_prob_df, size = 3) +
labs(title = "Radar-like Plot of Median Posterior Probability of (side)",
subtitle = "All d20 in grey, selected set in color",
x = NULL, y = NULL, color = NULL,
caption = "Reference line at 5%") +
scale_y_continuous(labels = scales::percent) +
scale_color_manual(values = set_fill_map) +
theme_bw() +
coord_polar()
})
# render each static radar-like plot in sequence
for (plt in mdn_post_prob_plts) {
  print(plt)
}